********************************************************************************
*PROJECT: Social interactions of lawmakers and the partisan divide								   
*PURPOSE: Clean scraped MP experience data	   
*UPDATE HISTORY
	*(26 May 2021 by ML): Created
********************************************************************************

// Setup: move to the raw MP folder, reset Stata, and load the scraped CSV.
// Paths use forward slashes (and ${} around the global) because Stata accepts
// "/" on every operating system, whereas "\" only works on Windows; this also
// matches the separator already used in the final -save- of this file.
// Requires the global macro path_pch to be defined before this file is run.
cd "${path_pch}/Data/raw/MP"
clear all
// output folder for the cleaned MP-session dataset written at the end of the file
local mpintfolder = "${path_pch}/Data/intermediate/MP"
import delimited using "mp_exper.csv", clear
// one tempfile per intermediate dataset built further down
tempfile mpsess mpcon mpmini mpcomm mpparl mpoth
// enables -pause- for interactive debugging (no -pause- call is visible in this file)
pause on

// use a consistent name for the MP identifier
rename mp_id MP_id

// snapshot the three scraped text columns before any cleaning, for later checks
generate exper_raw_1 = mp_exper
generate exper_raw_2 = v3
generate exper_raw_3 = v4

// experience category:
// rows containing an <h3> tag are category headers; strip the markup, carry the
// header text down over the following rows of the same MP, then discard the
// header rows themselves
generate header = strpos(mp_exper, "<h3>") > 0
foreach tag in "<h3>" "</h3>" ":<br/>" {
	replace mp_exper = subinstr(mp_exper, "`tag'", "", .)
}
generate expertype = mp_exper if header == 1
// -replace- runs observation by observation, so the header text cascades downwards
replace expertype = expertype[_n-1] if expertype == "" & MP_id == MP_id[_n-1]
replace expertype = "missing" if mp_exper == "missing"
assert expertype != ""
// a row whose text equals its own category is the header row itself
drop if expertype != "missing" & expertype == mp_exper
drop header
replace expertype = trim(expertype)

// English version of the experience category.
// Each entry is "Icelandic label|English label"; the assert at the end
// guarantees that every category found in the data has a translation.
generate expertype_eng = ""
foreach pair in ///
	"Alþingismaður|Althingi Member" ///
	"Embætti þingflokks|Parliamentary Party Office" ///
	"Formaður nefnda|Committee Chairman" ///
	"Forsetaembætti|Presidency" ///
	"Varaþingmaður|Deputy Member" ///
	"Aldursforseti (lengst þingseta)|Senior President (longest parliamentary term)" ///
	"Forseti Íslands|President of Iceland" ///
	"Gestur|Guest" ///
	"Ráðherraembætti|Ministerial Post" ///
	"missing|missing" {
	// split the pair at the "|" separator
	local cut = strpos("`pair'", "|")
	local icelandic = substr("`pair'", 1, `cut' - 1)
	local english = substr("`pair'", `cut' + 1, .)
	replace expertype_eng = "`english'" if expertype == "`icelandic'"
}
assert expertype_eng != ""

// session number:
// take the text that follows "<strong>", keep the part before the next "<",
// and convert it to a number
split mp_exper, parse("<strong>")
split mp_exper2, parse("<")
rename mp_exper21 session_id
destring session_id, replace
// discard all leftover pieces created by the two splits
drop mp_exper2? mp_exper?
order MP_id expertype expertype_eng session_id

// second text column: remove inline markup so constituency details can be read later
foreach tag in "<sup>" "</sup>" "<span>" "</span>" {
	replace v3 = subinstr(v3, "`tag'", "", .)
}
replace v3 = trim(v3)

// main text column: remove double quotes and markup ahead of the date parsing.
// The tags are removed in this exact order (longest opening tag first).
replace mp_exper = subinstr(mp_exper, `"""', "", .)
foreach tag in ///
	"<span style=font-family: monospace;><strong>" ///
	"</strong>" ///
	"<span style=font-family: monospace;>" ///
	"</span>" {
	replace mp_exper = subinstr(mp_exper, "`tag'", "", .)
}
replace mp_exper = trim(mp_exper)
// rows that carried a session number: drop the first 4 characters
// (presumably the session number plus separator -- confirm against the raw text)
replace mp_exper = substr(mp_exper, 5, .) if session_id != .
// MPs with an empty careers page get the sentinel session -99
replace session_id = -99 if mp_exper == "missing"
// remaining rows inherit the session of the row above, within the same MP
replace session_id = session_id[_n-1] if MP_id == MP_id[_n-1] & session_id[_n-1] != . & session_id == .

// dates:
// split the text on "." and read day/month/year pieces by position. The third
// piece supplies both the start year (first 4 characters) and the end day
// (last 2 characters).
replace mp_exper = trim(mp_exper)
split mp_exper, parse(".")
local has_dates mp_exper != "missing"
generate exper_start_day = substr(mp_exper1, -2, 2) if `has_dates'
generate exper_start_month = substr(mp_exper2, -2, 2) if `has_dates'
generate exper_start_year = substr(mp_exper3, 1, 4) if `has_dates'
generate exper_end_day = substr(mp_exper3, -2, 2) if `has_dates'
generate exper_end_month = substr(mp_exper4, -2, 2) if `has_dates'
generate exper_end_year = substr(mp_exper5, 1, 4) if `has_dates'
destring exper_start_*, replace
drop mp_exper?

// end dates of ongoing positions:
// a non-numeric end component is only tolerated for spells that started in 2020
// or later; those components are blanked so the end date becomes missing
foreach part in day month year {
	local endvar exper_end_`part'
	assert exper_start_year >= 2020 if `endvar' != "" & real(`endvar') == .
	replace `endvar' = "" if real(`endvar') == . // these are positions until present
}
destring exper_end_*, replace
// an end day without an end month means the wrong number was picked up
replace exper_end_day = . if exper_end_day != . & exper_end_month == .

// get constituency and other details
// Splits the cleaned text on ":"; the second piece (mp_exper2) is what the
// type-by-type sections below read as constituency / post / label text.
split mp_exper, parse(:)
replace mp_exper2 = trim(mp_exper2)

// fix rare case where end date is one day earlier than start date
// NOTE(review): -swapval- is not a built-in Stata command; presumably a
// community-contributed package that swaps the values of the two variables for
// the selected rows -- confirm it is installed and behaves this way.
// Only the day components are exchanged; the assert on exper_sdate<=exper_edate
// below catches any row that this does not repair.
swapval exper_end_day exper_start_day if mdy(exper_start_month,exper_start_day,exper_start_year)>mdy(exper_end_month,exper_end_day,exper_end_year)

// start and end dates
// mdy() returns missing when any component is missing, and Stata orders missing
// above every number, so spells with no end date pass the <= check below.
g exper_sdate = mdy(exper_start_month,exper_start_day,exper_start_year)
g exper_edate = mdy(exper_end_month,exper_end_day,exper_end_year)
format exper_sdate exper_edate %d
assert exper_sdate<=exper_edate

// other checks before clean type-by-type
// Identifiers and category must always be present; start dates must be complete
// and within range for every real record; end dates are range-checked only when
// an end day exists. The year bounds 1875-2021 are hard-coded here.
assert expertype!="" & expertype_eng!="" & MP_id!=. & session_id!=. 
assert exper_start_day!=. & exper_start_month!=. & exper_start_year!=. & ///
	inrange(exper_start_day,1,31) & inrange(exper_start_month,1,12) & inrange(exper_start_year,1875,2021) ///
	if expertype!="missing"
assert inrange(exper_end_day,1,31) & inrange(exper_end_month,1,12) & inrange(exper_end_year,1875,2021) ///
	if exper_end_day!=.	

	
// store the list of distinct (MP, session) pairs seen in the data;
// the final merge further down starts from this list
preserve
	keep MP_id session_id
	duplicates drop MP_id session_id, force
	save `mpsess'
restore


// From here on each experience type is cleaned separately and saved to its own tempfile
// (1) Althingi Member (source of constituency information)
preserve
	keep if expertype_eng == "Althingi Member"
	generate constituency_full = mp_exper2
	// the constituency code is the last two characters of the detail text
	generate constituency = substr(constituency_full, -2, 2)

	// re-create the text columns from scratch (copy, drop, rename back)
	foreach x in expertype expertype_eng v3 {
		tempvar copy
		generate `copy' = `x'
		drop `x'
		rename `copy' `x'
	}
	order expertype expertype_eng MP_id session_id exper_start_day exper_start_month ///
		exper_start_year exper_end_day exper_end_month exper_end_year v3

	// the third text column must be empty for this type
	assert v4 == ""
	drop mp_exper v4 exper_raw_? mp_exper?
	replace v3 = trim(v3)
	rename v3 misc // code for MP name, party, early years: upper vs. lower division
				   // * if "supporter of the government" (from government party)

	assert constituency != "" & constituency_full != ""

	// work out what constituency options mean
	// from session_id=129 (2003 session), only 6 constituencies:
	// (consistent with https://www.althingi.is/thingmenn/kjordaemi/um-kjordaemi/)
	// (used bio pages like https://www.althingi.is/altext/cv/is/?nfaerslunr=386 to check these)
	// NA -- Norðausturkjördæmis -- Northeast
	// NV -- Norðvesturkjördæmis -- Northwest
	// RN -- Reykjavíkurkjördæmis norður -- Reykjavik North
	// RS -- Reykjavíkurkjördæmis suður -- Reykjavik South
	// SU -- Suðurkjördæmis -- Southern
	// SV -- Suðvesturkjördæmis -- Southwest
	// Each list entry below is "CODE|English name" (codes are always 2 characters).
	generate constName = ""
	foreach pair in "NA|Northeast" "NV|Northwest" "RN|Reykjavik North" ///
		"RS|Reykjavik South" "SU|Southern" "SV|Southwest" {
		local code = substr("`pair'", 1, 2)
		local name = substr("`pair'", 4, .)
		replace constName = "`name'" if constituency == "`code'" & session_id >= 129
	}

	// from session_id=115-128, 8 constituencies:
	// (consistent with https://en.wikipedia.org/wiki/1999_Icelandic_parliamentary_election)
	// (used bio pages like https://www.althingi.is/altext/cv/is/?nfaerslunr=386 to check these)
	// AL -- Austurlands -- East
	// NE -- Norðurlands eystra -- Northeast
	// NV -- Norðurlands vestra -- Northwest
	// RN -- Reyknesinga -- same as Reykjanes I think (from wiki)
	// RV -- Reykvíkinga -- Reykjavik
	// SL -- Suðurlands -- Southern
	// VF -- Vestfirðinga -- Westfjords
	// VL -- Vesturlands -- West
	foreach pair in "AL|East" "NE|Northeast" "NV|Northwest" "RN|Reyknesinga" ///
		"RV|Reykjavik" "SL|Southern" "VF|Westfjords" "VL|West" {
		local code = substr("`pair'", 1, 2)
		local name = substr("`pair'", 4, .)
		replace constName = "`name'" if constituency == "`code'" & inrange(session_id, 115, 128)
	}

	// constituency type (leave missing before 115 since haven't checked/cleaned properly)
	// note that "RN" means Reykjavik North from session 129 on, but Reyknesinga before
	generate reykjavik = (inlist(constituency, "RN", "RS") & session_id >= 129) | constituency == "RV" if session_id >= 115
	generate southern = substr(constituency, 1, 1) == "S" | (constituency == "RN" & session_id <= 128) if session_id >= 115

	// other vars
	generate government_party = strpos(misc, "*") > 0 // may need more checking before using -- e.g. need to check whether alternate can be governor (hence get *) but not main member

	// order within constituency: read from the end of the text before the first "."
	split constituency_full, parse(".")
	generate constOrder = substr(constituency_full1, -2, 2)
	drop constituency_full1 constituency_full2 constituency_full3
	// keep both characters only when the first one is "1"; otherwise keep the last character
	replace constOrder = substr(constOrder, -1, 1) if substr(constOrder, 1, 1) != "1"
	// a trailing "m" means no number was present
	replace constOrder = "" if constOrder == "m"
	destring constOrder, replace

	save `mpcon'
restore


// (2) Ministerial Posts
preserve
	keep if expertype_eng == "Ministerial Post"

	// re-create the text columns from scratch (copy, drop, rename back)
	foreach x in expertype expertype_eng v3 {
		tempvar copy
		generate `copy' = `x'
		drop `x'
		rename `copy' `x'
	}
	order expertype expertype_eng MP_id session_id exper_start_day exper_start_month ///
		exper_start_year exper_end_day exper_end_month exper_end_year v3

	// structural checks: no end day without an end year, third text column empty
	assert exper_end_day == . if exper_end_year == .
	assert v4 == ""

	// the post name can spill into the second text column, so join before splitting on ":"
	generate exper = mp_exper + v3
	drop v4 mp_exper v3
	split exper, parse(":")
	rename exper2 ministerpost
	drop mp_exper? exper exper1 exper_raw_?

	// some one-off cleaning issues
	drop if MP_id == 719 & exper_start_day == exper_end_day & exper_start_month == exper_end_month & exper_start_year == exper_end_year
	drop if inlist(MP_id, 249, 558, 142, 237) & ministerpost == "" // these are duplicates

	// 380 has a couple of missing minister posts (early years), otherwise never missing
	assert MP_id == 380 | ministerpost != ""

	save `mpmini'
restore


// (3) Committee Chairs
preserve
	keep if expertype_eng == "Committee Chairman"

	// re-create the text columns from scratch (copy, drop, rename back)
	foreach x in expertype expertype_eng v3 {
		tempvar copy
		generate `copy' = `x'
		drop `x'
		rename `copy' `x'
	}
	order expertype expertype_eng MP_id session_id exper_start_day exper_start_month ///
		exper_start_year exper_end_day exper_end_month exper_end_year v3

	// structural checks: no end day without an end year, extra text columns empty
	assert exper_end_day == . if exper_end_year == .
	assert v4 == ""
	assert v3 == ""
	drop v4 v3 exper_raw_? mp_exper?

	// the committee name is the text of a link: split on "<", check that the
	// third piece is the closing tag, then take what follows ">" in the second piece
	split mp_exper, parse("<")
	assert mp_exper3 == "/a>"
	split mp_exper2, parse(">")
	rename mp_exper22 committeename
	drop mp_exper mp_exper? mp_exper21

	assert committeename != ""

	save `mpcomm'
restore


// (4) Parliamentary Party Office
preserve
	keep if expertype_eng == "Parliamentary Party Office"

	// re-create the text columns from scratch (copy, drop, rename back)
	foreach x in expertype expertype_eng v3 {
		tempvar copy
		generate `copy' = `x'
		drop `x'
		rename `copy' `x'
	}
	order expertype expertype_eng MP_id session_id exper_start_day exper_start_month ///
		exper_start_year exper_end_day exper_end_month exper_end_year v3

	// structural checks: no end day without an end year, extra text columns empty
	assert exper_end_day == . if exper_end_year == .
	assert v4 == ""
	assert v3 == ""
	drop v4 v3 exper_raw_? mp_exper mp_exper1

	// the post held is the text after the ":"
	generate parlpost = mp_exper2
	drop mp_exper2

	// English version of the post; each entry is "Icelandic|English"
	generate parlpost_eng = ""
	foreach pair in ///
		"formaður þingflokks|Chairman of the parliamentary party" ///
		"varaformaður þingflokks|Deputy chairman of the parliamentary party" ///
		"neðri deild|Lower division" {
		local cut = strpos("`pair'", "|")
		local icelandic = substr("`pair'", 1, `cut' - 1)
		local english = substr("`pair'", `cut' + 1, .)
		replace parlpost_eng = "`english'" if parlpost == "`icelandic'"
	}

	// every post must be present and translated
	assert parlpost != "" & parlpost_eng != ""

	save `mpparl'
restore


// (5) Remaining categories -- not likely to use, so less careful cleaning here
// Everything not handled in (1)-(4) is kept with its three text fields as generic labels.
preserve
	drop if inlist(expertype_eng, "Parliamentary Party Office", "Althingi Member", ///
			"Committee Chairman", "Ministerial Post")
	drop mp_exper exper_raw_? mp_exper1
	order expertype expertype_eng MP_id session_id exper_sdate exper_edate mp_exper2 v3 v4
	rename (mp_exper2 v3 v4) (experlabel1 experlabel2 experlabel3)

	save `mpoth'
restore


// Reduce each tempfile to one row per (MP, session) so that the result can be
// used directly for balance checks

// constituency details: keep the earliest spell in each session
// (stable sort, so ties on the start date keep their original order)
use `mpcon', clear
sort MP_id session_id exper_sdate, stable
by MP_id session_id: keep if _n == 1
isid MP_id session_id
keep MP_id session_id constituency_full constituency constName reykjavik southern ///
	government_party constOrder exper_sdate
rename exper_sdate const_sdate
save `mpcon', replace

// ministerial posts: flag sessions with at least one post
use `mpmini', clear
keep MP_id session_id
duplicates drop MP_id session_id, force
generate everminister = 1
save `mpmini', replace

// committee chairs: flag sessions with at least one chairmanship
use `mpcomm', clear
keep MP_id session_id
duplicates drop MP_id session_id, force
generate evercommitteechair = 1
save `mpcomm', replace

// parliamentary party offices: separate flags for deputy chairman and chairman,
// each equal to 1 if the post was held at any point in the session
use `mpparl', clear
bysort MP_id session_id: egen everparldeputy = max(parlpost_eng == "Deputy chairman of the parliamentary party")
bysort MP_id session_id: egen everparlchair = max(parlpost_eng == "Chairman of the parliamentary party")
keep MP_id session_id everparldeputy everparlchair
duplicates drop
isid MP_id session_id
save `mpparl', replace

// now merge together
// Start from the full list of (MP, session) pairs and attach each per-session
// summary. assert(1 3) makes every merge fail if a using file contains a pair
// that is absent from the master list. After each merge the indicator is set
// to 0 for unmatched real sessions; rows with the sentinel session -99 (empty
// careers page) keep missing values.
use `mpsess', clear

merge 1:1 MP_id session_id using `mpcon', assert(1 3) nogen
merge 1:1 MP_id session_id using `mpmini', assert(1 3) nogen
replace everminister=0 if everminister==. & session_id!=-99
merge 1:1 MP_id session_id using `mpcomm', assert(1 3) nogen
replace evercommitteechair=0 if evercommitteechair==. & session_id!=-99
merge 1:1 MP_id session_id using `mpparl', assert(1 3) nogen
replace everparldeputy=0 if everparldeputy==. & session_id!=-99
replace everparlchair=0 if everparlchair==. & session_id!=-99

// label variables
// Stata variable labels hold at most 80 characters, so every label below is
// kept within that limit (the constOrder label was shortened to fit).
la var const_sdate "first date as Althingi member this session"
la var constituency_full "constituency details as of first date as Althingi member this session"
la var constituency "constituency code as of first date as Althingi member this session"
la var constName "constituency name (english) as of first date as Althingi member this session"
la var reykjavik "=1 if reykjavik constituency as of first date as Althingi member this session"
la var southern "=1 if southern constituency as of first date as Althingi member this session"
la var government_party "MP comes from a government party"
la var constOrder "order elected in constituency (D'Hondt) as of first date as member this session"
la var everminister "=1 if ever hold ministerial post this session"
la var evercommitteechair "=1 if ever chair of committee this session"
la var everparldeputy "=1 if ever deputy chairman of parliamentary party this session"
la var everparlchair "=1 if ever chairman of parliamentary party this session"
la var session_id "=-99 if empty careers page for this MP_id (likely because no MP with this id)"

// previous-session and next-session versions of every ever* indicator.
// The value is taken from the neighbouring row only when that row is the
// directly adjacent session; otherwise it is 0 (missing for the -99 rows).
// The varlist ever* is expanded once, before the loop creates new variables.
sort MP_id session_id
foreach flag of varlist ever* {
	by MP_id: generate `flag'Last = `flag'[_n-1] if session_id[_n-1] == session_id - 1
	replace `flag'Last = 0 if `flag'Last == . & session_id != -99
	label variable `flag'Last "`flag' but for previous session"

	by MP_id: generate `flag'Next = `flag'[_n+1] if session_id[_n+1] == session_id + 1
	replace `flag'Next = 0 if `flag'Next == . & session_id != -99
	label variable `flag'Next "`flag' but for next session"
}

// minister in any earlier session: running total minus the current session's value
sort MP_id session_id
by MP_id: generate everministerPast = (sum(everminister) - everminister) > 0 if session_id != -99
label variable everministerPast "=1 if ever held ministerial post prior to this session"

// first session as Althingi member:
// the minimum is computed over rows that have a constituency start date, then
// mode() is used to copy that single value across the MP's rows
bysort MP_id: egen firstSessionMember_tmp = min(session_id) if const_sdate != . & session_id != -99
bysort MP_id: egen firstSessionMember = mode(firstSessionMember_tmp)
drop firstSessionMember_tmp
label variable firstSessionMember "ID of first session as Althingi member"

// experience proxy, defined only for sessions in which the MP sat as a member
generate sessExper = session_id - firstSessionMember if const_sdate != . & session_id != -99
label variable sessExper "number of sessions since first became member = session_id-firstSessionMember"

// write the MP-session dataset to the intermediate folder
save "`mpintfolder'/MP_session_careers", replace
